import pandas as pd
import numpy as np
import dalex as dx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
import joblib
# Global plotting defaults for the whole notebook:
# wide figures, slightly larger fonts, tick style with a background grid.
plt.rcParams['figure.figsize'] = (16,6)
sns.set(font_scale = 1.2)
sns.set_style("ticks",{'axes.grid' : True})
sns.set_palette("deep")
Analiza dla zadania klasyfikacji — airline passenger satisfaction.
# Load the train/test splits; the first CSV column is the row index.
_train_path = '../data/airline_passenger_satisfaction/train.csv'
_test_path = '../data/airline_passenger_satisfaction/test.csv'
df_train, df_test = (pd.read_csv(p, index_col=0) for p in (_train_path, _test_path))
class Transformer(BaseEstimator, TransformerMixin):
    """Preprocess the airline passenger satisfaction data.

    - 'Gender', 'Customer Type', 'Type of Travel', 'satisfaction'
      are mapped to 0/1.
    - 'Class' is one-hot encoded; the categories are learned in ``fit``
      so train and test always produce the same dummy columns, even if
      one split is missing a category.
    - Missing 'Arrival Delay in Minutes' values are imputed with the
      median learned in ``fit`` (median because of outliers).
    - Both delay columns are log1p-transformed to compress heavy tails.
    - 'id' and the raw 'Class' column are dropped.
    """

    def __init__(self):
        # Fitted state: imputation median and the known 'Class' categories.
        self.adim_median = None
        self.class_categories_ = None

    def fit(self, X, y=None):
        """Learn the imputation median and the 'Class' categories from X."""
        self.adim_median = X['Arrival Delay in Minutes'].median()
        # Fix the category set at fit time; sorted() matches the column
        # order pd.get_dummies would produce on the training data.
        self.class_categories_ = sorted(X['Class'].dropna().unique())
        return self

    def transform(self, X):
        """Return a transformed copy of X (X itself is not modified)."""
        tmp = X.copy()
        # Binary categoricals -> 0/1.
        tmp['Gender'] = (tmp['Gender'] == 'Male').astype(int)
        tmp['Customer Type'] = (tmp['Customer Type'] == 'Loyal Customer').astype(int)
        tmp['Type of Travel'] = (tmp['Type of Travel'] == 'Business travel').astype(int)
        tmp['satisfaction'] = (tmp['satisfaction'] == 'satisfied').astype(int)
        # One-hot encode 'Class' using the categories fixed at fit time,
        # so transform() on new data never yields a different column set.
        class_cat = pd.Categorical(tmp['Class'], categories=self.class_categories_)
        dummies = pd.get_dummies(pd.Series(class_cat, index=tmp.index), prefix='Class')
        tmp = pd.concat([dummies, tmp], axis=1)
        # Impute missing arrival delays with the training median.
        tmp['Arrival Delay in Minutes'] = tmp['Arrival Delay in Minutes'].fillna(self.adim_median)
        # log1p to tame the right-skewed delay distributions.
        tmp['Departure Delay in Minutes'] = np.log1p(tmp['Departure Delay in Minutes'])
        tmp['Arrival Delay in Minutes'] = np.log1p(tmp['Arrival Delay in Minutes'])
        # Drop the identifier and the now-encoded raw column.
        return tmp.drop(['id', 'Class'], axis=1)
# Fit the preprocessing on the training split, then apply it to both splits.
transformer = Transformer()
df_train = transformer.fit_transform(df_train)
df_test = transformer.transform(df_test)

def _split_features_target(frame):
    # Separate the binary target column from the feature matrix.
    return frame.drop('satisfaction', axis=1), frame['satisfaction']

X_train, y_train = _split_features_target(df_train)
X_test, y_test = _split_features_target(df_test)
Hiperparametry zostały dobrane na podstawie wcześniejszego strojenia (praca domowa 2).
# Random forest with hyperparameters taken from the earlier tuning (pd2).
_rf_params = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=20,
    max_features="sqrt",
    min_samples_leaf=10,
    random_state=0,
)
clf = RandomForestClassifier(**_rf_params).fit(X_train, y_train)
def report(X, y, title, model=None):
    """Print accuracy, F1, precision and recall for predictions on (X, y).

    Parameters
    ----------
    X, y : feature matrix and true binary labels.
    title : heading printed above the metrics.
    model : estimator with ``predict``; defaults to the global ``clf``
        so existing call sites keep working unchanged.
    """
    if model is None:
        model = clf  # backward-compatible fallback to the notebook-global model
    print(title)
    y_pred = model.predict(X)
    print("Accuracy:".ljust(12), f"{accuracy_score(y, y_pred):.4f}")
    print("F1-score:".ljust(12), f"{f1_score(y, y_pred):.4f}")
    print("Precision:".ljust(12), f"{precision_score(y, y_pred):.4f}")
    print("Recall:".ljust(12), f"{recall_score(y, y_pred):.4f}")
# Evaluate on both splits, separated by a blank line.
_evaluations = [
    ("Zbiór treningowy", X_train, y_train),
    ("Zbiór testowy", X_test, y_test),
]
for idx, (caption, X_part, y_part) in enumerate(_evaluations):
    if idx:
        print()
    report(X_part, y_part, caption)
Zbiór treningowy Accuracy: 0.9597 F1-score: 0.9528 Precision: 0.9666 Recall: 0.9394 Zbiór testowy Accuracy: 0.9530 F1-score: 0.9458 Precision: 0.9580 Recall: 0.9340
Możemy uznać, że model jest całkiem dobrze wytrenowany.
Wykorzystamy do tego paczkę Dalex.
exp = dx.Explainer(clf, X_test, y_test)
Preparation of a new explainer is initiated -> data : 25976 rows 24 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 25976 values -> model_class : sklearn.ensemble._forest.RandomForestClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x0000011369ACBCA0> will be used (default) -> predict function : Accepts pandas.DataFrame and numpy.ndarray. -> predicted values : min = 4.69e-05, mean = 0.439, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.979, mean = -0.000178, max = 0.995 -> model_info : package sklearn A new explainer has been created!
Spróbujemy wyjaśnić decyzję modelu dla dwóch obserwacji a i b ze zbioru testowego. Jedna z nich to klient usatysfakcjonowany, a druga - nie.
# Two test-set observations to explain: per the narrative above,
# a is a satisfied passenger, b a dissatisfied one.
a = X_test.iloc[[0]]
b = X_test.iloc[[2]]
# Predicted probability of satisfaction for a (~0.94 per the output below).
exp.predict(a)
array([0.93769557])
exp.predict(b)
array([0.01494634])
# Break-down attributions for both observations.
bd_a = exp.predict_parts(a, type='break_down', label="A")
bd_b = exp.predict_parts(b, type='break_down', label="B")
bd_a.plot()
bd_b.plot()
# Five most negative contributions for observation b.
bd_b.result.sort_values(by='contribution').head(5)
| variable_name | variable_value | variable | cumulative | contribution | sign | position | label | |
|---|---|---|---|---|---|---|---|---|
| 22 | Inflight wifi service | 2.0 | Inflight wifi service = 2.0 | 0.067191 | -0.138199 | -1.0 | 3 | B |
| 21 | Online boarding | 2.0 | Online boarding = 2.0 | 0.205389 | -0.105305 | -1.0 | 4 | B |
| 24 | Customer Type | 0.0 | Customer Type = 0.0 | 0.014946 | -0.039729 | -1.0 | 1 | B |
| 18 | Checkin service | 2.0 | Checkin service = 2.0 | 0.347318 | -0.036996 | -1.0 | 7 | B |
| 15 | Baggage handling | 3.0 | Baggage handling = 3.0 | 0.433603 | -0.027239 | -1.0 | 10 | B |
Część zmiennych, np. inflight wifi service, online boarding czy baggage handling, w obu przypadkach ma zauważalny wpływ. Pozostałe zmienne są różne.
# SHAP-based attributions for the same two observations.
for observation, shap_label in ((a, 'A'), (b, 'B')):
    exp.predict_parts(observation, type='shap', label=shap_label).plot()
Aż 6 z 9 zmiennych, które pojawiły się na wykresach, powtarza się. Możemy sądzić, że niektóre udogodnienia w trakcie lotu są ważniejsze niż inne i to one głównie wpływają na satysfakcję klienta.
# Ceteris-paribus profiles over four service-rating variables,
# all of which share the same 1-5 rating scale.
variables = ['Inflight wifi service', 'Online boarding', 'Baggage handling', 'Inflight entertainment']
_rating_scale = [1, 2, 3, 4, 5]
variable_splits = {name: list(_rating_scale) for name in variables}
cp_a = exp.predict_profile(a, label='A', variables=variables, variable_splits=variable_splits)
cp_b = exp.predict_profile(b, label='B', variables=variables, variable_splits=variable_splits)
cp_a.plot(cp_b)
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 4/4 [00:00<00:00, 85.28it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 4/4 [00:00<00:00, 88.51it/s]
Im wyższa ocena, tym klient jest bardziej usatysfakcjonowany. Jednak poszczególne czynniki zdają się różnić mocą, np. inflight wifi service na poziomie 5 dla obserwacji b mógłby sprawić, że klient zmieniłby swoją satysfakcję.
# Permutation-based variable importance for the whole model.
vi = exp.model_parts()
vi.plot()
# Partial-dependence profiles: two 1-5 rating variables and
# two 0/1 flags produced by the Transformer.
_rating_scale = [1, 2, 3, 4, 5]
_binary_scale = [0, 1]
variables = ['Inflight wifi service', 'Type of Travel', 'Online boarding', 'Customer Type']
variable_splits = {
    'Inflight wifi service': list(_rating_scale),
    'Online boarding': list(_rating_scale),
    'Type of Travel': list(_binary_scale),
    'Customer Type': list(_binary_scale),
}
pdp = exp.model_profile(type='partial', variables=variables,
                        variable_type='categorical', variable_splits=variable_splits)
pdp.plot()
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 4/4 [00:00<00:00, 33.94it/s]
Warte zauważenia jest to, że oceny 1-3 dla sprawdzanych zmiennych "ocenowych" wydają się być jednakowo negatywne.
Najważniejsze zmienne:
- inflight wifi service
- online boarding
- checkin service
- cleanliness
- baggage handling
- type of travel — podróż biznesowa wskazuje na satysfakcję, a osobista na jej brak
- customer type — lojalni klienci są częściej usatysfakcjonowani